#############
# Functions # 
#############

# Find alignment quality
get_alignment_qual <- function(rate) {
  if (rate >= 85) {
    qual <- "high"
  } else if (rate >= 65) {
    qual <- "okay"
  } else {
    qual <- "low"
  }
  
  qual
}

# Create a PCA Plot
plot_pca <- function(vsd, group_by, color_palette = NULL){
  if(is.null(color_palette)){
    color_palette <- brewer.pal(length(levels(vsd[[group_by]])), "Set1")
    names(color_palette) <- levels(vsd[[group_by]])
  }
  pcaData <- plotPCA(vsd, intgroup = group_by, returnData = T)
    colnames(pcaData)[3] <- "group_by"
    percentVar <- round(100 * attr(pcaData, "percentVar"))
    pca_plot <- ggplot(data = pcaData,
                       mapping = aes(x = PC1,
                                     y = PC2,
                                     color = group_by)) +
    theme_classic() +
    geom_point(size = 3) +
    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
    ylab(paste0("PC2: ", percentVar[2], "% variance")) +
    labs(color = group_by) +
    #coord_fixed() +
    scale_color_manual(values = color_palette)
return(pca_plot)
}


# Function to determine sample distances
get_distances <- function(vsd){
  sampleDists <- dist(t(assay(vsd)))
  sampleDistMatrix <- as.matrix(sampleDists)
  colnames(sampleDistMatrix) <- NULL
  colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
  dist_map <- pheatmap(sampleDistMatrix,
                       clustering_distance_rows=sampleDists,
                       clustering_distance_cols=sampleDists,
                       col=colors)
  return(dist_map)
}

# Function to return sig genes for a comparison
get_de <- function(object, column, var1, var2, write_csv = F,
                   p_value = 0.05, lfc = 0.5, lfc_shrink = TRUE,
                   output_dir){
  res <- DESeq2::results(object, contrast = c(column, var1, var2))
  if(lfc_shrink){
    res <- DESeq2::lfcShrink(object, contrast = c(column, var1, var2),
                             res = res, type = "ashr")
  }
  resOrdered <- res[order(res$pvalue), ]
  sigGenes <- resOrdered[!is.na(resOrdered$padj), ]
  sigGenes <- sigGenes[sigGenes$padj < p_value, ]
  sigGenes <- sigGenes[abs(sigGenes$log2FoldChange) > lfc, ]
  # Add columns for gene id and ens id
  sigGenes_colnames <- colnames(sigGenes)
  sigGenes$gene_name <- sub("_ENSMUSG.*", "", rownames(sigGenes))
  sigGenes$ens_id <- sub(".*_ENSMUSG", "ENSMUSG", rownames(sigGenes))
  # Place the new columns at the front
  sigGenes <- sigGenes[ , c("gene_name", "ens_id", sigGenes_colnames)]
  if(write_csv){
    file_name <- paste0(var1, "_vs_", var2, ".csv")
    write.csv(sigGenes,
              file = file.path(output_dir, "DE_files", file_name),
              row.names = FALSE)
  }
  return(list(DE_genes = sigGenes,
              DE_df = res))
}

# Function to make a heatmap
make_heatmap <- function(dds, de_genes, treatment, control, group,
                         print_genenames = FALSE, gene_identifier = "Gene_ID",
                         cluster_cols = FALSE, save_heatmap = TRUE,
                         output_dir = "/results/", plot_groups = "all",
                         color_test = NULL){
  # First make a "col data" data frame. This is just telling
  # pheatmap what you want to use to color the columns.
  df <- as.data.frame(colData(dds)[,c(group)])
  sample_info <- colData(dds)
  colnames(df) <- group
  rownames(df) <- rownames(colData(dds))
  # Pull out DE genes from the test we ran earlier, start with Gata6 KO
  genes <- rownames(de_genes)
  if(is.null(color_test)){
    color_test <- brewer.pal(length(levels(dds[[group]])), "Set1")
    names(color_test) <- levels(dds[[group]])
  }
  coloring <- list(color_test)
  names(coloring) <- group
  heatmap_df <- assay(vsd)[genes,]
  if(!("all" %in% plot_groups)){
    sample_info <- colData(dds)
    sample_plot <- sample_info[sample_info[[group]] %in% plot_groups, ]
    heatmap_df <- heatmap_df[ , rownames(sample_plot)]
  } else {
    sample_plot <- sample_info
  }
  # Heatmaps are always mean cenetered. Meaning the value shown is actually the 
  # expression value of the sample minus the mean. This command centers the
  # dataframe so it can be plotted. When it isn't centered it is very hard to
  # see any trends because all genes have very different expression levels.
  heatmap_scale <- t(scale(t(heatmap_df), scale = TRUE))
  palOut <- colorRampPalette(blueYellow)(256)
  if(!cluster_cols){
    if(!identical(colnames(heatmap_scale), rownames(sample_plot))){
      heatmap_scale <- heatmap_scale[ , rownames(sample_plot)]
    }
    # Order based on the comparison
    col_order <- c(
      grep(control, sample_plot[[group]]),
      grep(treatment, sample_plot[[group]]),
      grep((paste0(treatment, "|", control)), sample_plot[[group]],
           invert = TRUE)
    )
    heatmap_scale <- heatmap_scale[ , col_order]
  }
  if(print_genenames){
    gene_names <- rownames(heatmap_scale)
    if(gene_identifier == "Gene_ID"){
      gene_names <- sub("_ENSMUSG[0-9]*", "", gene_names)
    } else if (gene_identifier == "ENS_ID"){
      gene_names <- sub("*_ENSMUSG", "ENSMUSG", gene_names)
    }
    rownames(heatmap_scale) <- gene_names
  }
  # Here we make the heatmap
  heatmap <- pheatmap(heatmap_scale, cluster_rows = TRUE,
                      cluster_cols = cluster_cols,
                      show_rownames = print_genenames,
                      show_colnames = TRUE, annotation_col = df,
                      annotation_colors = coloring, color = blueYellow,
                      border_color = NA, clustering_method = "complete")
  
  if(save_heatmap){
    comparison <- paste0(control, "_vs_", treatment, ".pdf")
    pdf(file.path(output_dir, comparison), width = 10, height = 10)
    print(heatmap)
    dev.off()
  }
  
  return(heatmap)
}

# Run gprofiler separate positive and negative
run_gprofiler <- function(gene_table, pos_name, neg_name,
                          custom_bg = FALSE, 
                          correction_method = "gSCS",
                          exclude_iea = FALSE,
                          save_dir = NULL,
                          plot_dir = NULL){
  pos_sig_table <- gene_table[gene_table$log2FoldChange > 0, ]
  neg_sig_table <- gene_table[gene_table$log2FoldChange < 0, ]
  
  # Here we order by the log fold change so that "ordered_query" can be true
  pos_sig_table <- pos_sig_table[order(pos_sig_table$log2FoldChange),]
  neg_sig_table <- neg_sig_table[order(neg_sig_table$log2FoldChange),]
  pos_sig_genes <- rev(pos_sig_table$ens_id)
  neg_sig_genes <- neg_sig_table$ens_id
  pos_gost <- gost(query = pos_sig_genes,
                   organism = "mmusculus",
                   ordered_query = TRUE,
                   exclude_iea = exclude_iea,
                   user_threshold = 0.05,
                   correction_method = correction_method,
                   custom_bg = custom_bg)
  
  neg_gost <- gost(query = neg_sig_genes,
                   organism = "mmusculus",
                   ordered_query = TRUE,
                   exclude_iea = exclude_iea,
                   user_threshold = 0.05,
                   correction_method = correction_method,
                   custom_bg = custom_bg)
  
  sig_pos_res <- pos_gost$result[pos_gost$result$significant == TRUE, ]
  sig_neg_res <- neg_gost$result[neg_gost$result$significant == TRUE, ]
  
  if (!is.null(save_dir)){
    pos_save <- paste0(pos_name, "_from_", pos_name, "_vs_", neg_name, ".csv")
    neg_save <- paste0(neg_name, "_from_", pos_name, "_vs_", neg_name, ".csv")
    pos_save_rds <- paste0(pos_name, "_from_", pos_name, "_vs_",
                           neg_name, ".rds")
    neg_save_rds <- paste0(neg_name, "_from_", pos_name, "_vs_",
                           neg_name, ".rds")
    saveRDS(pos_gost, file = file.path(save_dir,pos_save_rds))
    saveRDS(neg_gost, file = file.path(save_dir, neg_save_rds))
    if(nrow(sig_pos_res) > 1){
      sig_pos_res_csv <- apply(sig_pos_res, 2, as.character)
      write.csv(sig_pos_res_csv, file = file.path(save_dir, pos_save))
    }
    if(nrow(sig_neg_res) > 1){
      sig_neg_res_csv <- apply(sig_neg_res, 2, as.character)
      write.csv(sig_neg_res_csv, file = file.path(save_dir, neg_save))
    }
  }
  if (!is.null(plot_dir)){
    save_name <- paste0(pos_name, "_vs_", neg_name, "_separate.pdf")
    
    # This opens up a pdf file. We will save many images into this file
    pdf(file.path(plot_dir, save_name))
    
    # These make the plots
    plots <- list()
    plots$C_GOBP <- gost_plots(sig_pos_res, "GO:BP", pos_name)
    plots$C_GOMF <- gost_plots(sig_pos_res, "GO:MF", pos_name)
    plots$C_GOCC <- gost_plots(sig_pos_res, "GO:CC", pos_name)
    plots$C_KEGG <- gost_plots(sig_pos_res, "KEGG", pos_name)
    plots$C_TF <- gost_plots(sig_pos_res, "TF", pos_name)
    plots$T_GOBP <- gost_plots(sig_neg_res, "GO:BP", neg_name)
    plots$T_GOMF <- gost_plots(sig_neg_res, "GO:MF", neg_name)
    plots$T_GOCC <- gost_plots(sig_neg_res, "GO:CC", neg_name)
    plots$T_KEGG <- gost_plots(sig_neg_res, "KEGG", neg_name)
    plots$T_TF <- gost_plots(sig_neg_res, "TF", neg_name)
    
    plots_list <- lapply(plots, function(x){
      if (!is.null(x)){
        plot(x)
      }
    })
    
    # This closes the pdf
    dev.off()
    

  }
  return_list <- list(pos_gost, neg_gost)
  names(return_list) <- c(pos_name, neg_name)
  return_list$plots <- plots_list
  return(return_list)
  
}

# Run gprofiler on all DE genes
run_gprofiler_all <- function(gene_table, pos_name, neg_name,
                             custom_bg = FALSE, 
                             correction_method = "gSCS",
                             exclude_iea = FALSE,
                             save_dir = NULL,
                             plot_dir = NULL){
  
  # Here we order by the log fold change so that "ordered_query" can be true
  gene_table <- gene_table[order(gene_table$log2FoldChange),]
  sig_genes <- rev(gene_table$ens_id)
  gost_res <- gost(query = sig_genes,
                   organism = "mmusculus",
                   ordered_query = TRUE,
                   exclude_iea = exclude_iea,
                   user_threshold = 0.05,
                   correction_method = correction_method,
                   custom_bg = custom_bg)
  
  sig_res <- gost_res$result[gost_res$result$significant == TRUE, ]

  if (!is.null(save_dir)){
    save_name <- paste0(pos_name, "_vs_", neg_name, ".csv")
    save_rds <- paste0(pos_name, "_vs_", neg_name, ".rds")
    saveRDS(gost_res, file = file.path(save_dir, save_rds))
    if(nrow(sig_res) > 1){
      sig_res_csv <- apply(sig_res, 2, as.character)
      write.csv(sig_res_csv, file = file.path(save_dir, save_name))
    }
  }
  if (!is.null(plot_dir)){
    save_name <- paste0(pos_name, "_vs_", neg_name, ".pdf")
    
    # This opens up a pdf file. We will save many images into this file
    pdf(file.path(plot_dir, save_name))
    
    # These make the plots
    plots <- list()
    plot_name <- paste0(pos_name, "_vs_", neg_name)
    plots$GOBP <- gost_plots(sig_res, "GO:BP", plot_name)
    plots$GOMF <- gost_plots(sig_res, "GO:MF", plot_name)
    plots$GOCC <- gost_plots(sig_res, "GO:CC", plot_name)
    plots$KEGG <- gost_plots(sig_res, "KEGG", plot_name)
    plots$TF <- gost_plots(sig_res, "TF", plot_name)
    
    plots_list <- lapply(plots, function(x){
      if (!is.null(x)){
        plot(x)
      }
    })
    
    # This closes the pdf
    dev.off()
    

  }
  return_list <- list(gost_res)
  names(return_list) <- c(plot_name)
  return_list$plots <- plots_list
  return(return_list)
  
}

gost_plots <- function(results_table, source, title){
  source_results <- results_table[grep(source, results_table$source), ]
  if (nrow(source_results) > 0) {
    source_results <- source_results[order(source_results$precision), ]
    source_results <- source_results[1:40, ]
    source_results$term_name <- factor(source_results$term_name, levels = 
                                   unique(source_results$term_name))
    source_results$log_padj <- -log10(source_results$p_value)
    go_plot <-ggplot(source_results, aes(x = precision,
                                         y = term_name,
                                         color = log_padj,
                                         size = intersection_size)) +
      geom_point() +
      theme_classic() +
      scale_size(name = "Intersection",
                 range(c(1,max(source_results$intersection_size)))) +
      viridis::scale_color_viridis() +
      theme(text = ggplot2::element_text(size = 10)) +
      ggtitle(paste0(source, ": ", title)) +
      labs(color = "-log10(p-value)") +
      xlab("Precision (proportion of genes)") +
      ylab("Term")

    
    return(go_plot)
  }
}

# Make gene plots
plot_genes <- function(gene_id, gene_id_list, deseq_obj,
                       intgroup, plot_ggplot = TRUE,
                       color = NULL, return_data = TRUE,
                       print = TRUE, save_path = NULL){
  # Locate the index of the gene of interest
  index <- grep(paste0("^", gene_id, "$"), gene_id_list)
  if(length(index) == 0){
    print(paste0(gene_id, " not in deseq object"))
    return(NULL)
  } else if(length(index == 1)) {
    counts_plot <- make_plots(index = index, deseq_obj = deseq_obj,
                              intgroup = intgroup, plot_ggplot = plot_ggplot,
                              color = color, return_data = return_data,
                              print = print, save_path = save_path)
    return(counts_plot)
  } else {
    # This is in case a gene has two ensembl values
    plot_list <- lapply(index, function(x) make_plots(index = x,
                                                     deseq_obj = deseq_obj,
                                                     intgroup = intgroup,
                                                     plot_ggplot = plot_ggplot,
                                                     color = color, 
                                                     return_data = return_data,
                                                     print = print,
                                                     save_path = save_path))
    return(plot_list)
  }
}
make_plots <- function(index, deseq_obj, intgroup, plot_ggplot,
                       color, return_data, print, save_path){
  gene_name <- rownames(deseq_obj)[index]
  counts_plot <- DESeq2::plotCounts(deseq_obj, gene = gene_name,
                                    intgroup = intgroup,
                                    returnData = TRUE)
  if(plot_ggplot){
  if(is.null(color)){
    color <- brewer.pal(length(levels(deseq_obj[[group_by]])), "Set1")
    names(color) <- levels(deseq_obj[[group_by]])
  }
    colnames(counts_plot) <- c("count", "Group")
    ggplot_counts_plot <- ggplot2::ggplot(counts_plot,
                                          ggplot2::aes(x = Group,
                                                       y = count)) +
      ggplot2::geom_point(ggplot2::aes(color = Group), size = 3) +
      ggplot2::scale_color_manual(values = color, name = "Group") +
      ggplot2::theme_classic() +
      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45,
                                                vjust = 1,
                                                hjust=1)) +
      ggplot2::ggtitle(gene_name)
    if(print){
      print(ggplot_counts_plot)
    }
    if (!is.null(save_path)){
      file_name <- paste0("counts_plot_", gene_name, ".pdf")
      file_path <- file.path(save_path, file_name)
      ggplot2::ggsave(filename = file_path, plot = ggplot_counts_plot)
    }
    if(return_data){
      return(ggplot_counts_plot)
    }
  } else {
    return(counts_plot)
  }
}

################
# Theme colors #
################

# Colors for heatmap (from the ArchR package)
blueYellow <- c("#352A86", "#343DAE", "#0262E0", "#1389D2", "#2DB7A3",
    "#A5BE6A", "#F8BA43", "#F6DA23", "#F8FA0D")

# Colors for statistics
fastqc_colors <- c("#e31a1c", "#238443", "#ec7014")

star_colors <- c("#737373", "#8c6bb1", "#ec7014",
                 "#e31a1c", "#238443", "#225ea8")

Background

This documents the analysis of the effect of cytokine treatment on PTPN2 KO mice. This document aims to analyze differences between PTPN2 KO and WT mice in their response to cytokine treatment.

# Load in the files
###############
# Count table #
###############
# Read in the count table
count_table <- read.table(params$counts, sep = "\t", row.names = 1, header = T)

################
# Sample table #
################

# Read in the sample table
sample_table <- read.table(params$sample_info, sep = ",", header = T,
                           row.names = 1)

# Ensure the count table and sample table are in the same order
count_table <- count_table[ , rownames(sample_table)]

########
# Star #
########

# Read in star stats
star_stats <- read.table(params$star_stats, sep = "\t", row.names = 1,
                         header = T)

# Filter for mapping % and total read counts
star_stats <- star_stats[grepl(
  "%|Uniquely mapped reads|mapped to multiple loci",
  rownames(star_stats)), ]

star_stats <- star_stats[!grepl(
  "^Mismatch|chimeric", rownames(star_stats)), ]

# Calculate median size
star_stats_median <- star_stats[grepl(
  "Uniquely mapped reads", rownames(star_stats)), ]

median_size <- median(as.numeric(
  star_stats_median["Uniquely mapped reads number", ])) / 1000000

median_rate <- median(as.numeric(str_remove(
  star_stats_median["Uniquely mapped reads %", ], "%")))

alignment_qual <- get_alignment_qual(median_rate)

# Pull out all sample names
samples <- colnames(star_stats)

# Make a column of metrics
star_stats$metric <- rownames(star_stats)

# Make into long form for ggplot
star_stats_long <- gather(star_stats, key = "Sample", value = "value",
                          all_of(samples))


# Add column for value type
star_stats_long <- star_stats_long %>%
  mutate(
    value = as.numeric(str_remove(value, "%")),
    val_type = ifelse(grepl("%", metric),
                      "pct", "int")
  )

# Pull out read counts
# Calculate median library size
read_counts <- star_stats_long %>%
  filter(grepl("mapped reads number", metric)) %>%
  mutate(value = round(value / 1000000)) %>%
  mutate(value = str_c(value, " million"))

##########
# FastQC #
##########

# Load in fastqc results
fastqc_summary <- read.table(params$fastqc, sep = "\t")

# Change the colnames
colnames(fastqc_summary) <- c("Result", "Test", "Sample")

# Update sample names
fastqc_summary$Sample <- sub("_S[0-9]*_L[0-9]*_", "", fastqc_summary$Sample)
fastqc_summary$Sample <- sub("_001.fastq.gz", "", fastqc_summary$Sample)

##############
# Set colors #
##############
sample_table[[params$sample_column]] <- 
  factor(sample_table[[params$sample_column]])
sample_colors <- brewer.pal(length(
  levels(sample_table[[params$sample_column]])), "Set1")
names(sample_colors) <- levels(sample_table[[params$sample_column]])

Quality Control

FastQC Summary

FastQC was used to assess the quality of each fastq file. A summary of the results is shown below, the overall library quality was ****.

fastqc_plot <- ggplot(data = fastqc_summary,
                      mapping = aes(x = Sample,
                                    y = Test,
                                    fill = Result)) +
  geom_tile(color = "white", size = 0.5) +
  scale_fill_manual(values = fastqc_colors) +
  theme_classic() +
  theme(
    legend.title = element_blank(),
    legend.text  = element_text(size = 8, color = "black"),
    axis.title   = element_blank(),
    axis.text    = element_text(size = 8, color = "black"),
    axis.text.x  = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

fastqc_plot

Alignment Summary

Reads were aligned to the GRCm38 genome using the STAR RNA-seq aligner. A summary of the results is shown below, the library size is displayed on each bar. The median library size was 47.310542 million reads and the median alignment rate was 67.83. Overall, the alignment rate was okay.

# Plot alignment stats
star_stats_long_pct <- star_stats_long %>%
  filter(val_type == "pct") %>%
  
  # Add median rates to metric label
  mutate(
    metric = str_remove(metric, "% of reads | reads %"),
    metric = str_to_sentence(metric)
  ) %>%
  arrange(value) %>%
  mutate(metric = fct_inorder(metric))
  
# Plot STAR alignment stats
star_plot <- ggplot(data = star_stats_long_pct,
                    mapping = aes(x = Sample,
                                  y = value,
                                  fill = metric)) +
  geom_bar(stat = "identity", color = "white", size = 0.5) +
  
  # Label bars with library size
  geom_text(
    data = read_counts, 
    aes(Sample, 50, label = value),
    show.legend = F,
    inherit.aes = F,
    color = "white", 
    angle = 90
  ) +
  
  scale_fill_manual(
    values = star_colors,
    guide  = guide_legend(reverse = T)
  ) +
  scale_y_continuous(labels = function(x) {str_c(x, "%")}) +
  theme_classic() +
  theme(
    legend.title = element_blank(),
    legend.text  = element_text(size = 8, color = "black"),
    axis.title   = element_blank(),
    axis.text    = element_text(size = 8, color = "black"),
    axis.text.x  = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

star_plot

PCA

Principal component analysis was performed to assess similarities between samples. Biological replicates were plotted for each experimental condition. When replicates cluster close together, this suggests that overall gene expression patterns are fairly reproducible.

# This makes a DESeq2 object where the count data is our count matrix and the colData is our sample data. I am currently making the design based on "group" but we can change this if necessary
dds <- DESeqDataSetFromMatrix(countData = count_table,
                              colData = sample_table,
                              design = formula(paste("~",
                                                     params$sample_column)))
# Here we get rid of all the genes that are not expressed
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)
# This transforms the counts to have constant variance. This helps ensue that highly or lowly expressed genes will not contribute too much for the PCA or clustering analysis
vsd <- vst(dds, blind = F)

plot_pca(vsd, group_by = params$sample_column, color_palette = sample_colors)

Sample Distances

Correlations between samples was determined to further assess similarities between samples. Biological replicates should cluster together. Samples that are more similar to each other have values closer to 0.

# This determines distance between samples.
get_distances(vsd)

Differential Expression Analysis

WT_NoTreatment_vs_PTPN2_KO_NoTreatment

# Extract DE genes
sig_genes <- get_de(dds, params$sample_column, control, treatment,
                    write_csv = T, p_value = params$DE_alpha,
                    lfc = 0, output_dir = params$output_dir,
                    lfc_shrink = T)

sig_genes_high_lfc <- get_de(dds, params$sample_column, control, treatment,
                             write_csv = F, p_value = params$DE_alpha,
                             lfc = params$DE_lfc,
                             lfc_shrink = T)

DE_genes_n <- nrow(sig_genes$DE_genes)

DEseq2 analysis

112 differentially expressed genes were identified using the DESeq2 package.

Differentially expressed genes are labeled on the MA plot in blue.

plotMA(sig_genes$DE_df, main = paste0(control, " vs ", treatment))

Heatmaps

Heatmap of DE genes across all samples

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = TRUE,
                        output_dir = heatmap_dir,
                        color_test = sample_colors)
print(heatmap)

Heatmap of DE genes across only the samples used in the comparision

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = FALSE,
                        plot_groups = c(treatment, control),
                        color_test = sample_colors)
print(heatmap)

Gene Set Enrichment

For Gene Set Enrichment, I used gprofiler2 It uses a hypergeometric test to determine overrepresentation of genes from different categories.

all_gene_ens <- sub(".*_ENSMUSG", "ENSMUSG", rownames(dds))

result <- run_gprofiler_all(gene_table = sig_genes$DE_genes,
                            pos_name = control,
                            neg_name = treatment,
                            custom_bg = all_gene_ens,
                            save_dir = file.path(params$output_dir, "GSE_files"),
                            plot_dir = file.path(params$output_dir, "images",
                                                 "GSE"))

plots <- result$plots

plot_grid(plots$GOBP, plots$GOMF, plots$GOCC, plots$KEGG,
          nrow = 4,
          ncol = 1,
          align = "v",
          axis = "tb")

WT_NoTreatment_vs_WT_Cytokine_Treatment

# Extract DE genes
sig_genes <- get_de(dds, params$sample_column, control, treatment,
                    write_csv = T, p_value = params$DE_alpha,
                    lfc = 0, output_dir = params$output_dir,
                    lfc_shrink = T)

sig_genes_high_lfc <- get_de(dds, params$sample_column, control, treatment,
                             write_csv = F, p_value = params$DE_alpha,
                             lfc = params$DE_lfc,
                             lfc_shrink = T)

DE_genes_n <- nrow(sig_genes$DE_genes)

DEseq2 analysis

12,390 differentially expressed genes were identified using the DESeq2 package.

Differentially expressed genes are labeled on the MA plot in blue.

plotMA(sig_genes$DE_df, main = paste0(control, " vs ", treatment))

Heatmaps

Heatmap of DE genes across all samples

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = TRUE,
                        output_dir = heatmap_dir,
                        color_test = sample_colors)
print(heatmap)

Heatmap of DE genes across only the samples used in the comparision

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = FALSE,
                        plot_groups = c(treatment, control),
                        color_test = sample_colors)
print(heatmap)

Gene Set Enrichment

For Gene Set Enrichment, I used gprofiler2 It uses a hypergeometric test to determine overrepresentation of genes from different categories.

all_gene_ens <- sub(".*_ENSMUSG", "ENSMUSG", rownames(dds))

result <- run_gprofiler_all(gene_table = sig_genes$DE_genes,
                            pos_name = control,
                            neg_name = treatment,
                            custom_bg = all_gene_ens,
                            save_dir = file.path(params$output_dir, "GSE_files"),
                            plot_dir = file.path(params$output_dir, "images",
                                                 "GSE"))

plots <- result$plots

plot_grid(plots$GOBP, plots$GOMF, plots$GOCC, plots$KEGG,
          nrow = 4,
          ncol = 1,
          align = "v",
          axis = "tb")

PTPN2_KO_NoTreatment_vs_PTPN2_KO_Cytokine_Treatment

# Extract DE genes
sig_genes <- get_de(dds, params$sample_column, control, treatment,
                    write_csv = T, p_value = params$DE_alpha,
                    lfc = 0, output_dir = params$output_dir,
                    lfc_shrink = T)

sig_genes_high_lfc <- get_de(dds, params$sample_column, control, treatment,
                             write_csv = F, p_value = params$DE_alpha,
                             lfc = params$DE_lfc,
                             lfc_shrink = T)

DE_genes_n <- nrow(sig_genes$DE_genes)

DEseq2 analysis

12,853 differentially expressed genes were identified using the DESeq2 package.

Differentially expressed genes are labeled on the MA plot in blue.

plotMA(sig_genes$DE_df, main = paste0(control, " vs ", treatment))

Heatmaps

Heatmap of DE genes across all samples

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = TRUE,
                        output_dir = heatmap_dir,
                        color_test = sample_colors)
print(heatmap)

Heatmap of DE genes across only the samples used in the comparision

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = FALSE,
                        plot_groups = c(treatment, control),
                        color_test = sample_colors)
print(heatmap)

Gene Set Enrichment

For Gene Set Enrichment, I used gprofiler2 It uses a hypergeometric test to determine overrepresentation of genes from different categories.

all_gene_ens <- sub(".*_ENSMUSG", "ENSMUSG", rownames(dds))

result <- run_gprofiler_all(gene_table = sig_genes$DE_genes,
                            pos_name = control,
                            neg_name = treatment,
                            custom_bg = all_gene_ens,
                            save_dir = file.path(params$output_dir, "GSE_files"),
                            plot_dir = file.path(params$output_dir, "images",
                                                 "GSE"))

plots <- result$plots

plot_grid(plots$GOBP, plots$GOMF, plots$GOCC, plots$KEGG,
          nrow = 4,
          ncol = 1,
          align = "v",
          axis = "tb")

WT_Cytokine_Treatment_vs_PTPN2_KO_Cytokine_Treatment

# Extract DE genes
sig_genes <- get_de(dds, params$sample_column, control, treatment,
                    write_csv = T, p_value = params$DE_alpha,
                    lfc = 0, output_dir = params$output_dir,
                    lfc_shrink = T)

sig_genes_high_lfc <- get_de(dds, params$sample_column, control, treatment,
                             write_csv = F, p_value = params$DE_alpha,
                             lfc = params$DE_lfc,
                             lfc_shrink = T)

DE_genes_n <- nrow(sig_genes$DE_genes)

DEseq2 analysis

631 differentially expressed genes were identified using the DESeq2 package.

Differentially expressed genes are labeled on the MA plot in blue.

plotMA(sig_genes$DE_df, main = paste0(control, " vs ", treatment))

Heatmaps

Heatmap of DE genes across all samples

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = TRUE,
                        output_dir = heatmap_dir,
                        color_test = sample_colors)
print(heatmap)

Heatmap of DE genes across only the samples used in the comparision

heatmap_dir <- file.path(params$output_dir, "images", "heatmaps")
heatmap <- make_heatmap(dds = dds,
                        de_genes = sig_genes$DE_genes,
                        treatment = treatment,
                        control = control,
                        group = params$sample_column,
                        print_genenames = FALSE,
                        cluster_cols = FALSE,
                        save_heatmap = FALSE,
                        plot_groups = c(treatment, control),
                        color_test = sample_colors)
print(heatmap)

Gene Set Enrichment

For Gene Set Enrichment, I used gprofiler2 It uses a hypergeometric test to determine overrepresentation of genes from different categories.

all_gene_ens <- sub(".*_ENSMUSG", "ENSMUSG", rownames(dds))

result <- run_gprofiler_all(gene_table = sig_genes$DE_genes,
                            pos_name = control,
                            neg_name = treatment,
                            custom_bg = all_gene_ens,
                            save_dir = file.path(params$output_dir, "GSE_files"),
                            plot_dir = file.path(params$output_dir, "images",
                                                 "GSE"))

plots <- result$plots

plot_grid(plots$GOBP, plots$GOMF, plots$GOCC, plots$KEGG,
          nrow = 4,
          ncol = 1,
          align = "v",
          axis = "tb")

Gene plots

Gene plots show the normalized expression of key genes across all samples. We can add in any genes that you would like to see in this format.

Ins1

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Ins2

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Gbp2

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Gbp5

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Cxcl1

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Pomc

all_gene_names <- rownames(dds)
all_gene_ids <- sub("_ENSMUSG.*", "", all_gene_names)
plot <- plot_genes(gene_id = gene,
                   gene_id_list = all_gene_ids,
                   deseq_obj = dds,
                   intgroup = params$sample_column,
                   plot_ggplot = TRUE,
                   color = sample_colors,
                   return_data = FALSE,
                   print = TRUE,
                   save_path = file.path(params$output_dir, "images", "gene_plots"))

Session Info

sessionInfo()
R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] org.Hs.eg.db_3.12.0         AnnotationDbi_1.52.0       
 [3] KEGGREST_1.30.1             pathview_1.30.1            
 [5] DESeq2_1.30.0               SummarizedExperiment_1.20.0
 [7] Biobase_2.50.0              MatrixGenerics_1.2.0       
 [9] matrixStats_0.57.0          GenomicRanges_1.42.0       
[11] GenomeInfoDb_1.26.2         IRanges_2.24.1             
[13] S4Vectors_0.28.1            BiocGenerics_0.36.0        
[15] apeglm_1.12.0               ashr_2.2-47                
[17] RColorBrewer_1.1-2          viridis_0.5.1              
[19] viridisLite_0.3.0           gprofiler2_0.2.0           
[21] devtools_2.3.2              usethis_2.0.0              
[23] ggrepel_0.9.1               pheatmap_1.0.12            
[25] Matrix_1.3-2                forcats_0.5.0              
[27] stringr_1.4.0               dplyr_1.0.3                
[29] purrr_0.3.4                 readr_1.4.0                
[31] tidyr_1.1.2                 tibble_3.0.5               
[33] ggplot2_3.3.3               tidyverse_1.3.0            
[35] cowplot_1.1.1               BiocManager_1.30.10        
[37] knitr_1.30                 

loaded via a namespace (and not attached):
  [1] readxl_1.3.1           backports_1.2.1        plyr_1.8.6            
  [4] lazyeval_0.2.2         splines_4.0.3          BiocParallel_1.24.1   
  [7] digest_0.6.27          invgamma_1.1           htmltools_0.5.1       
 [10] SQUAREM_2021.1         fansi_0.4.2            magrittr_2.0.1        
 [13] memoise_1.1.0          remotes_2.2.0          Biostrings_2.58.0     
 [16] annotate_1.68.0        modelr_0.1.8           bdsmatrix_1.3-4       
 [19] prettyunits_1.1.1      colorspace_2.0-0       blob_1.2.1            
 [22] rvest_0.3.6            haven_2.3.1            xfun_0.20             
 [25] callr_3.5.1            crayon_1.3.4           RCurl_1.98-1.2        
 [28] jsonlite_1.7.2         graph_1.68.0           genefilter_1.72.0     
 [31] survival_3.2-7         glue_1.4.2             gtable_0.3.0          
 [34] zlibbioc_1.36.0        XVector_0.30.0         DelayedArray_0.16.0   
 [37] pkgbuild_1.2.0         Rgraphviz_2.34.0       scales_1.1.1          
 [40] mvtnorm_1.1-1          DBI_1.1.1              Rcpp_1.0.6            
 [43] xtable_1.8-4           emdbook_1.3.12         bit_4.0.4             
 [46] truncnorm_1.0-8        htmlwidgets_1.5.3      httr_1.4.2            
 [49] ellipsis_0.3.1         farver_2.0.3           pkgconfig_2.0.3       
 [52] XML_3.99-0.5           dbplyr_2.0.0           locfit_1.5-9.4        
 [55] labeling_0.4.2         tidyselect_1.1.0       rlang_0.4.10          
 [58] munsell_0.5.0          cellranger_1.1.0       tools_4.0.3           
 [61] cli_2.2.0              generics_0.1.0         RSQLite_2.2.2         
 [64] broom_0.7.3            evaluate_0.14          yaml_2.2.1            
 [67] processx_3.4.5         bit64_4.0.5            fs_1.5.0              
 [70] KEGGgraph_1.50.0       xml2_1.3.2             compiler_4.0.3        
 [73] rstudioapi_0.13        png_0.1-7              plotly_4.9.3          
 [76] testthat_3.0.1         reprex_0.3.0           geneplotter_1.68.0    
 [79] stringi_1.5.3          ps_1.5.0               desc_1.2.0            
 [82] lattice_0.20-41        vctrs_0.3.6            pillar_1.4.7          
 [85] lifecycle_0.2.0        data.table_1.13.6      bitops_1.0-6          
 [88] irlba_2.3.3            R6_2.5.0               gridExtra_2.3         
 [91] sessioninfo_1.1.1      MASS_7.3-53            assertthat_0.2.1      
 [94] pkgload_1.1.0          rprojroot_2.0.2        withr_2.4.0           
 [97] GenomeInfoDbData_1.2.4 hms_1.0.0              grid_4.0.3            
[100] coda_0.19-4            rmarkdown_2.6          mixsqp_0.3-43         
[103] bbmle_1.0.23.1         numDeriv_2016.8-1.1    lubridate_1.7.9.2